Final Project - Analyzing Sales Data

Date: 12 September 2024

Author: Chonlaphon Chantararat (Phon lifetofree)

Course: Pandas Foundation

# import data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the store dataset into a DataFrame
df = pd.read_csv("sample-store.csv")

# Peek at the first five rows
df.head()

# Dimensions of the DataFrame: (rows, columns)
df.shape
(9994, 21)
# Summarize column names, dtypes, non-null counts, and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row ID 9994 non-null int64
1 Order ID 9994 non-null object
2 Order Date 9994 non-null object
3 Ship Date 9994 non-null object
4 Ship Mode 9994 non-null object
5 Customer ID 9994 non-null object
6 Customer Name 9994 non-null object
7 Segment 9994 non-null object
8 Country/Region 9994 non-null object
9 City 9994 non-null object
10 State 9994 non-null object
11 Postal Code 9983 non-null float64
12 Region 9994 non-null object
13 Product ID 9994 non-null object
14 Category 9994 non-null object
15 Sub-Category 9994 non-null object
16 Product Name 9994 non-null object
17 Sales 9994 non-null float64
18 Quantity 9994 non-null int64
19 Discount 9994 non-null float64
20 Profit 9994 non-null float64
dtypes: float64(4), int64(2), object(15)
memory usage: 1.6+ MB

We can use the pd.to_datetime() function to convert the 'Order Date' and 'Ship Date' columns to datetime.

# Demonstrate pd.to_datetime() on a small sample of 'Order Date'
pd.to_datetime(df['Order Date'].head(), format='%m/%d/%Y')
TODO - convert order date and ship date to datetime in the original dataframe

# Convert both date columns to datetime64[ns] in place
for col in ('Order Date', 'Ship Date'):
    df[col] = pd.to_datetime(df[col], format='%m/%d/%Y')

# Confirm the conversion on the first ten rows
df.head(10)
TODO - count nan in postal code column

# Number of missing values in the 'Postal Code' column
count_na = df['Postal Code'].isnull().sum()

count_na
11
TODO - filter rows with missing values

# Keep only the rows that have no missing value in any column
df_filter = df.dropna(axis=0, how='any')

df_filter
TODO - Explore this dataset on your own; ask your own questions

# Question: how many orders per date and segment?

# Count orders per (Order Date, Segment) pair.
# Fix: the original groupby key list was missing a comma, so the two
# strings concatenated into the nonexistent column 'Order DateSegment'.
df_order_date = (
    df_filter
    .groupby(['Order Date', 'Segment'])
    .size()
    .reset_index(name='Count Order')
)

df_order_date

Data Analysis Part

Answer the 10 questions below to get credit for this course. Write pandas code to find the answers.

TODO 01 - how many columns, rows in this dataset

# Number of rows and columns in the dataset
df.shape

# ans: 9994 rows, 21 columns
(9994, 21)
TODO 02 - are there any missing values? If so, which column and how many NaN values?

# Count missing values per column
df.isnull().sum()

# ans: Postal Code has 11 missing values
TODO 03 - your friend ask for `California` data, filter it and export csv for him

# Export the California rows to CSV.
# Fixes: reset_index(drop=True) so the old index is not carried along as a
# column, and index=False so to_csv does not add an unnamed index column.
result = df.dropna().query('State == "California"').reset_index(drop=True)

result.to_csv('result.csv', index=False)
TODO 04 - your friend ask for all order data in `California` and `Texas` in 2017 (look at Order Date), send him csv file

# All California and Texas orders placed in 2017, exported to CSV.
df_state = df.dropna().query('State == "California" | State == "Texas"')

# Boolean mask over df_state's own index: orders placed in 2017
order_year = df_state['Order Date'].dt.year == 2017

# index=False keeps the CSV free of a spurious index column
df_state[order_year].to_csv('result_orders_2017.csv', index=False)
TODO 05 - how much total sales, average sales, and standard deviation of sales your company make in 2017

# Total, average, and standard deviation of Sales in 2017.
# Fix: the agg list was missing commas — 'sum''mean''std' concatenated
# into the single invalid aggregation name 'summeanstd'.
order_year = df['Order Date'].dt.year == 2017

# .loc with a column label avoids chained indexing
df.loc[order_year, 'Sales'].agg(['sum', 'mean', 'std'])
TODO 06 - which Segment has the highest profit in 2018

# Restrict to orders placed in 2018
in_2018 = df['Order Date'].dropna().dt.year == 2018
# Total profit per segment, keeping only the single largest
profit_by_segment = df[in_2018].groupby('Segment')['Profit'].sum()
profit_by_segment.nlargest(1)
TODO 07 - which top 5 States have the least total sales between 15 April 2019 - 31 December 2019

date_start = pd.to_datetime('2019-04-15')
date_end = pd.to_datetime('2019-12-31')

# Build the mask and select on the SAME frame. The original built the mask
# on df but indexed df.dropna(): the boolean Series cannot be aligned with
# the shorter index and pandas raises an IndexingError.
order_range = df['Order Date'].between(date_start, date_end)

df_order_range = df.loc[order_range]

# Total sales per state, then the five smallest totals
df_bottom_5 = df_order_range.groupby('State')['Sales'].sum()
df_bottom_5.nsmallest(5)
TODO 08 - what is the proportion of total sales (%) in West + Central in 2019 e.g. 25% 

# Sales recorded during 2019
is_2019 = df['Order Date'].dropna().dt.year == 2019
df_2019 = df.loc[is_2019]
total_sales_2019 = df_2019['Sales'].sum()

# Restrict to the West and Central regions
df_region = df_2019.query(' Region == "West" | Region == "Central" ')
total_sales_region = df_region['Sales'].sum()

# Share (%) of 2019 sales contributed by West + Central
sales_proportion = (total_sales_region / total_sales_2019) * 100
sales_proportion
54.97479891837763
TODO 09 - find top 10 popular products in terms of number of orders vs. total sales during 2019-2020

# Restrict to orders placed in 2019 or 2020.
# Fix: isin([20192020]) was a single literal 20,192,020 — no year matches.
order_years = df['Order Date'].dt.year.isin([2019, 2020])
df_years = df[order_years]

# Per product: total sales and number of order lines.
# Fix: agg needs a dict mapping column -> function; the original set
# literal {'Sales''sum', ...} concatenated the strings instead.
df_products = (
    df_years
    .groupby('Product Name')
    .agg({'Sales': 'sum', 'Order ID': 'count'})
    .reset_index()
)

# Rename columns for readability (commas were missing in the original list)
df_products.columns = ['Product Name', 'Total Sales', 'Number of Orders']

# Sort by order count, then by total sales, both descending
df_products = df_products.sort_values(
    ['Number of Orders', 'Total Sales'], ascending=[False, False]
)

# Top 10 most popular products
df_products.head(10)
TODO 10 - plot at least 2 plots, any plot you think interesting :)

df_clean = df.copy().dropna()
# Ensure 'Order Date' is datetime64[ns]; convert if the cell is rerun on a
# freshly loaded CSV where the column is still a string.
if df_clean['Order Date'].dtype != np.dtype('datetime64[ns]'):
    df_clean['Order Date'] = pd.to_datetime(df_clean['Order Date'], format='%m/%d/%Y')

## ---------- 1st plot: monthly sales, one line per year ---------- ##
# Helper columns derived from the order date
df_clean['YearMonth'] = df_clean['Order Date'].dt.to_period('M')
df_clean['Year'] = df_clean['Order Date'].dt.year
df_clean['Month'] = df_clean['Order Date'].dt.month

# Sum sales per (Year, Month).
# Fixes: the groupby key list was missing a comma ('Year''Month' collapsed
# into the single key 'YearMonth'), and agg needs a dict, not a set.
df_sale_month = df_clean.groupby(['Year', 'Month']).agg({'Sales': 'sum'}).reset_index()
df_sale_month['YearMonth'] = df_sale_month['Year'].astype(str) + "-" + df_sale_month['Month'].astype(str)

# One line per year, months on the x-axis.
# Fix: figsize must be a (width, height) tuple; (106) was a plain int.
plt.figure(figsize=(10, 6))
for year in df_clean['Year'].unique():
    year_data = df_sale_month[df_sale_month['Year'] == year]
    plt.plot(year_data['Month'], year_data['Sales'], label=year)

plt.xlabel('Month')
plt.ylabel('Sales')
plt.title('Year Sales by Month')
plt.legend()
plt.show()

## ---------- 2nd plot: yearly sales per category ---------- ##
# Sum sales per (Year, Category).
# Fixes: missing comma in the groupby key list and a set instead of a
# dict in agg (same defects as the first plot).
df_sale_category = df_clean.groupby(['Year', 'Category']).agg({'Sales': 'sum'}).reset_index()

# figsize must be a (width, height) tuple; (106) was a plain int
plt.figure(figsize=(10, 6))
width = 0.25  # width of each bar
categories = df_sale_category['Category'].unique()
for i, category in enumerate(categories):
    category_data = df_sale_category[df_sale_category['Category'] == category]
    # Offset each category so the whole group of bars is centered on the
    # year tick (the original offsets left the group shifted to the right)
    positions = [x + (i - (len(categories) - 1) / 2) * width for x in category_data['Year']]
    plt.bar(positions, category_data['Sales'], width, label=category)

plt.xlabel('Year')
plt.ylabel('Sales')
plt.title('Year Sales by Category')
plt.xticks(df_sale_category['Year'])
plt.legend()
plt.show()
TODO Bonus - use np.where() to create new column in dataframe to help you answer your own questions

df_clean = df.copy().dropna()
# Ensure 'Order Date' is datetime64[ns]; convert if it is still a string
if df_clean['Order Date'].dtype != np.dtype('datetime64[ns]'):
    df_clean['Order Date'] = pd.to_datetime(df_clean['Order Date'], format='%m/%d/%Y')

# Use np.where() to create a flag column, as the TODO asks, marking
# Technology orders placed in 2017.
df_clean['Tech2017'] = np.where(
    (df_clean['Order Date'].dt.year == 2017) & (df_clean['Category'] == 'Technology'),
    'Yes', 'No'
)

# Select the flagged rows from df_clean itself. The original indexed the
# full df with positions computed from df_clean — which has 11 fewer rows
# after dropna() — so the positional indices pointed at the wrong rows.
selected_rows = df_clean[df_clean['Tech2017'] == 'Yes']
selected_rows